#' ---
#' title: "ANES 2016 -> 2020 Analysis"
#' author: ""
#' date: "`r Sys.Date()`"
#' encoding: UTF-8
#' output:
#'   distill_article:
#'     toc: true
#'     toc_float: true
#'     toc_depth: 2
#' header-includes:
#'  - \usepackage{booktabs}
#'  - \usepackage{longtable}
#'  - \usepackage{array}
#'  - \usepackage{multirow}
#'  - \usepackage{wrapfig}
#'  - \usepackage{float}
#'  - \usepackage{colortbl}
#'  - \usepackage{pdflscape}
#'  - \usepackage{tabu}
#'  - \usepackage{threeparttable}
#'  - \usepackage{threeparttablex}
#'  - \usepackage[normalem]{ulem}
#'  - \usepackage{makecell}
#'  - \usepackage{dcolumn}
#'  - \usepackage{setspace}\doublespacing
#' ---

################################################
## ANES text analysis
################################################

## ---- spin_code, eval = FALSE, include = FALSE ---------------------------
# spin code to output Rmd
# set knit = FALSE to generate Rmd / Rnw but not compile

# rmarkdown::render(input = here::here("text_code", "anes2020_processing.R"), output_format = "pdf_document", clean = FALSE)

## see all chunks and clear all chunks
# knitr:::knit_code$get()      # check chunks in the current session
# knitr:::knit_code$restore()  # clean up the session

## ---- setup_global_options, include = FALSE -------------------
library(knitr)
# opts_chunk$set(fig.lp = "fig:", echo = FALSE, message = FALSE, warning = FALSE, error = FALSE, include = TRUE, dev = "cairo_pdf")

# Default knitr chunk options for this document: code is not echoed, messages
# and warnings are not shown, and figures are written with both listed devices.
opts_chunk$set(
    fig.lp = "fig:",
    echo = FALSE, message = FALSE, warning = FALSE, error = FALSE,
    dev = c("cairo_pdf", "png"),
    dpi = 288
)


## ---- load-packages-abes2020, include = FALSE ----

source(here::here("text_code/text_packages_rep.R"))


# identify color palette
# scales::show_col(amerika_palette("Dem_Ind_Rep5"))


## ---- load_data ----

# Restore the processed ANES 2016 objects into the workspace; verbose = TRUE
# prints the names of the restored objects.
# NOTE(review): presumably this is where `a16` (used in the merge below) comes from.
load(here("text_data_output", "anes2016_processed.Rdata"), verbose = TRUE)


# Alternative source (disabled): ICPSR SPSS release, which has labeled columns
# anes20 <- read_sav(file = here("text_data_raw", "ANES", "ICPSR_38034_SPSS", "DS0001", "38034-0001-Data.sav"))

# ANES 2020 time series, Stata release (read with haven so value labels are kept)
anes20 <- here("text_data_raw", "ANES", "anes_timeseries_2020_stata_20220210.dta") %>%
    read_dta()

# haven::print_labels(anes20$V200003)

# Labels:
#     value                                       label
# 2                     2. ANES 2016-2020 Panel
# 3 3. 3Ar1 Fresh sample: web only, replicate 1
# 4 4. 3Ar2 Fresh sample: web only, replicate 2
# 5            5. 3B Fresh sample: web or phone
# 6    6. 3C Fresh sample: video, web, or phone


#dim(panel20)


# Read one worksheet of the ANES 2020 redacted open-ended workbook, normalise
# its column names with clean_names(), and rename the response-text column.
#   sheet    - worksheet name inside the workbook
#   old_name - name of the text column after clean_names()
#   new_name - name the text column should carry in the returned table
# Returns the worksheet as read, with only that one column renamed.
read_anes20_openend <- function(sheet, old_name, new_name) {
    workbook <- here("text_data_raw", "ANES", "anes_timeseries_2020_redactedopenends_excel_20211118.xlsx")
    responses <- clean_names(read_excel(workbook, sheet = sheet))
    rename(responses, all_of(stats::setNames(old_name, new_name)))
}

# V201107 - PRE: What is it that R likes about Democratic Presidential candidate [text]
anes20_like_dem <- read_anes20_openend(
    sheet    = "V201107",
    old_name = "v201107_pre_what_is_it_that_r_likes_about_democratic_presidential_candidate_text",
    new_name = "text_dem20_like"
)

# V201109 - PRE: What is it that R dislikes about Democratic Presidential candidate [text]
anes20_dislike_dem <- read_anes20_openend(
    sheet    = "V201109",
    old_name = "v201109_pre_what_is_it_that_r_dislikes_about_democratic_presidential_candidate_text",
    new_name = "text_dem20_dislike"
)

# V201111 - PRE: What is it that R likes about Republican Presidential candidate [text]
anes20_like_rep <- read_anes20_openend(
    sheet    = "V201111",
    old_name = "v201111_pre_what_is_it_that_r_likes_about_republican_presidential_candidate_text",
    new_name = "text_rep20_like"
)

# V201113 - PRE: What is it that R dislikes about Republican Presidential candidate [text]
anes20_dislike_rep <- read_anes20_openend(
    sheet    = "V201113",
    old_name = "v201113_pre_what_is_it_that_r_dislikes_about_republican_presidential_candidate_text",
    new_name = "text_rep20_dislike"
)



# V202205 - POST: Most important problems facing the country - mention 1 [text]
anes20_prob20_1 <- read_anes20_openend(
    sheet    = "V202205",
    old_name = "v202205_post_most_important_problems_facing_the_country_mention_1_text",
    new_name = "text_prob20_1"
)


# V202207 - POST: Most important problems facing the country - mention 2 [text]
anes20_prob20_2 <- read_anes20_openend(
    sheet    = "V202207",
    old_name = "v202207_post_most_important_problems_facing_the_country_mention_2_text",
    new_name = "text_prob20_2"
)


# V202209 - POST: Most important problems facing the country - mention 3 [text]
anes20_prob20_3 <- read_anes20_openend(
    sheet    = "V202209",
    old_name = "v202209_post_most_important_problems_facing_the_country_mention_3_text",
    new_name = "text_prob20_3"
)


# POST: Most important problems facing the country - most important [text]
# NOTE(review): the worksheet requested is "V202011" while the column it holds
# is prefixed v202211 -- confirm the sheet name against the workbook.
anes20_prob20_4 <- read_anes20_openend(
    sheet    = "V202011",
    old_name = "v202211_post_most_important_problems_facing_the_country_most_important_text",
    new_name = "text_prob20_4"
)


# Attach each open-ended table to the main 2020 file in turn, matching the
# main file's V200001 to the open-ended tables' v200001. Every row of anes20
# is kept (left joins); the like/dislike tables go first, then the
# most-important-problem tables.
text2020 <- Reduce(
    function(merged, openend) {
        left_join(merged, openend, by = c("V200001" = "v200001"))
    },
    list(
        anes20_like_dem,
        anes20_dislike_dem,
        anes20_like_rep,
        anes20_dislike_rep,
        anes20_prob20_1,
        anes20_prob20_2,
        anes20_prob20_3,
        anes20_prob20_4
    ),
    anes20
)

# nchar_prob <-
#     function(problem_text) {
#         case_when(
#             is.na(problem_text)  ~ 0, #NA_integer_,
#             problem_text == "-6" ~ 0, #NA_integer_,
#             problem_text == "-7" ~ 0, #NA_integer_,
#             problem_text == "-8" ~ 0, # Don't know
#             problem_text == "-9" ~ 0,  #NA_integer_, # 0, # Refused
#             TRUE ~ nchar(problem_text)
#         ) 
#     }


# Character count of each open-ended response, with non-responses counted as 0.
#
# x: vector of response text. NA and the literal codes "-1", "-5", "-6", "-7",
#    "-8", "-9" are treated as "no text".
# Returns a double vector the same length as x: 0 for NA / coded entries,
# nchar() of the entry otherwise.
#
# Written in base R so the result is always double; mixing a double 0 with the
# integer result of nchar() inside case_when() is rejected by dplyr releases
# that require all right-hand sides to share one type.
nchar20 <- function(x) {
    missing_codes <- c("-1", "-5", "-6", "-7", "-8", "-9")
    counts <- numeric(length(x))
    has_text <- !is.na(x) & !(x %in% missing_codes)
    counts[has_text] <- nchar(x[has_text])
    counts
}

# Open-ended text columns that receive a companion "<column>_nchar" length column.
text_vars <- c(
    "text_dem20_like",
    "text_dem20_dislike",
    "text_rep20_like",
    "text_rep20_dislike",
    "text_prob20_1",
    "text_prob20_2",
    "text_prob20_3",
    "text_prob20_4"
)

text2020 <- mutate(
    text2020,
    across(all_of(text_vars), nchar20, .names = "{.col}_nchar")
)

# text2020_nchar <- text2020 %>%
    # mutate(across(-v200001, ~ nchar20(.), .names = "{.col}_nchar"))

# 2020 voter-validation file (CSV)
vv20 <- here("text_data_raw", "ANES", "anes_timeseries_2020_csv_VoterValidation.csv") %>%
    read_csv()

#dim(vv20)


# time2020pre <- read_csv(here("text_data_raw/ANES/anes_timeseries_2020_questiontimings_preelection_csv_20210719.csv"))
# 
# time2020post <- read_csv(here("text_data_raw/ANES/anes_timeseries_2020_questiontimings_postelection_csv_20210719.csv"))

# time2020pre <- time2020pre %>%
#     rename_with(~ paste0(make_clean_names(.) , "_pre"))
# 
# time2020post <- time2020post %>%
#     rename_with(~ paste0(make_clean_names(.) , "_post"))


# Working 2020 sample. The restriction to 2016-2020 panel cases
# (V200003 == 2) is disabled, so every row of text2020 is carried forward.
panel20 <- text2020 # %>% filter(V200003 == 2)


# loading in anes data via R data
# load( #all non-open-ended data
#     file = here("text_data_raw", "ANES", "ICPSR_38034_R", "DS0001", "38034-0001-Data.rda"), verbose = TRUE) 


## ---- merge-data ----

#panel20_text <- left_join(panel20, text2020_nchar, by = c("V200001" = "v200001"))

# Add the voter-validation columns by case id; the question-timing joins
# stay disabled.
panel20vvtt <- left_join(panel20, vv20, by = "V200001")
    #%>% left_join(time2020pre,  by = c("V200001" = "case_id_pre"))
    #%>% left_join(time2020post, by = c("V200001" = "case_id_post"))

#dim(panel20)
#dim(panel20vv)

# Bring in the 2016 columns for rows whose V160001_orig matches a16;
# all 2020 rows are retained.
a20 <- panel20vvtt %>%
    left_join(a16, by = "V160001_orig")

#a20 <- left_join(anes, text2020_nchar, by = c("V160001_orig" = "V160001_ORIG"))




## ---- recode-data ----

# First round of 2020 recodes: registration status, registration location,
# pre-election interview mode, and party identification.
# Values not matched by any case_when() branch become NA.
a20 <- a20 %>% 
    mutate(
        # ideo7_20 = case_when(
        #     V201202 > 0 ~ V201202,
        #     TRUE ~ NA_real_
        # ),
        # 
        reg20_fct = case_when(
            V201025x == 1  ~ "Not Reg, No Intent",
            V201025x == 2  ~ "Not Reg, Intent",
            V201025x == 3  ~ "Reg, Not Early",
            V201025x == 4  ~ "Reg, Early",
            TRUE ~ NA_character_
        ) %>% as.factor(),
        # Registered (1) vs not registered (0). Both ends of each range are
        # bounded so that negative codes become NA here, exactly as they do in
        # reg20_fct; an open-ended `<= 2` would score them as "not registered".
        reg20_bin = case_when(
            V201025x >= 1 & V201025x <= 2 ~ 0,
            V201025x >= 3 & V201025x <= 4 ~ 1,
            TRUE ~ NA_real_
        ),
        reg_loc20_bin = case_when(
            V201008 == 1  ~ 1, # Among those who voted 88.8% were registered at current address
            V201008 == 2 | V201008 == 3 ~ 0
        ),
        reg_loc20_fct = case_when(
            V201008 == 1 ~ "Registered", # Among those who voted 88.8% were registered at current address
            V201008 == 2 ~ "Registered Elsewhere",
            V201008 == 3 ~ "Not Registered"
        ),
        mode_pre20 = case_when(
            V200002 == 1 ~ "video",
            V200002 == 2 ~ "tele",
            V200002 == 3 ~ "web"
        ) %>% as.factor(),
        # Keep only positive V201228 codes; zero and negative codes become NA.
        pid3_20 = ifelse(V201228 > 0, V201228, NA_real_),
        # Three-category party ID with "Ind" moved to the first (reference) level.
        pid3_20_fct = case_when(
            pid3_20 == 1 ~ "Dem",
            pid3_20 == 2 ~ "Rep",
            pid3_20 == 3 ~ "Ind",
            TRUE ~ NA_character_
        ) %>% fct_relevel("Ind"),
        # Same as above plus an "Oth" category for code 5.
        pid4_20_fct = case_when(
            pid3_20 == 1 ~ "Dem",
            pid3_20 == 2 ~ "Rep",
            pid3_20 == 3 ~ "Ind",
            pid3_20 == 5 ~ "Oth",
            TRUE ~ NA_character_
        ) %>% fct_relevel("Ind")
        
    )

        
# Party-ID change measures between 2016 (`pid3_16`, compared here against
# "Dem"/"Rep"/"Ind") and 2020 (`pid3_20`, compared against 1 = Dem, 2 = Rep,
# 3 = Ind, 5 = other). case_when() uses the first branch whose condition is
# TRUE and treats an NA condition as "no match", so rows with a missing 2016
# or 2020 value fall through to the final TRUE branch of each recode.
a20 <- a20 %>% 
    mutate(
        # identify subjects who switch Party ID
        # NOTE(review): the TRUE ~ 0 fallback also assigns 0 to rows where
        # pid3_16 or pid3_20 is NA -- confirm that is intended.
        switcher_bin = case_when(
            pid3_16 == "Dem" & (pid3_20 == 2 | pid3_20 == 3 | pid3_20 == 5) ~ 1, 
            pid3_16 == "Rep" & (pid3_20 == 1 | pid3_20 == 3 | pid3_20 == 5) ~ 1, 
            pid3_16 == "Ind" & (pid3_20 == 1 | pid3_20 == 2 | pid3_20 == 5) ~ 1, 
            TRUE ~ 0),
        # Full 3x3 transition label; any other combination (including
        # pid3_20 == 5) is NA.
        switcher_fct = case_when(
            pid3_16 == "Dem" & (pid3_20 == 1) ~ "Dem16->Dem20", 
            pid3_16 == "Dem" & (pid3_20 == 2) ~ "Dem16->Rep20", 
            pid3_16 == "Dem" & (pid3_20 == 3) ~ "Dem16->Ind20", 
            pid3_16 == "Rep" & (pid3_20 == 2) ~ "Rep16->Rep20", 
            pid3_16 == "Rep" & (pid3_20 == 1) ~ "Rep16->Dem20", 
            pid3_16 == "Rep" & (pid3_20 == 3) ~ "Rep16->Ind20", 
            pid3_16 == "Ind" & (pid3_20 == 3) ~ "Ind16->Ind20", 
            pid3_16 == "Ind" & (pid3_20 == 1) ~ "Ind16->Dem20", 
            pid3_16 == "Ind" & (pid3_20 == 2) ~ "Ind16->Rep20", 
            TRUE ~ NA_character_),
        # Stay vs. leave label per 2016 party; "Ind16->Ind20" is made the
        # first factor level.
        switcher2_20_fct = case_when(
            pid3_16 == "Dem" & (pid3_20 == 1) ~ "Dem16->Dem20", 
            pid3_16 == "Dem" & (pid3_20 != 1) ~ "Dem16->!Dem20", 
            #pid3_16 == "Dem" & (pid3_20 == 3) ~ "Dem16->!Dem20", 
            pid3_16 == "Rep" & (pid3_20 == 2) ~ "Rep16->Rep20", 
            pid3_16 == "Rep" & (pid3_20 != 2) ~ "Rep16->!Rep20", 
            #pid3_16 == "Rep" & (pid3_20 == 3) ~ "Rep16->!Rep20", 
            pid3_16 == "Ind" & (pid3_20 == 3) ~ "Ind16->Ind20", 
            pid3_16 == "Ind" & (pid3_20 != 3) ~ "Ind16->!Ind20", 
            #pid3_16 == "Ind" & (pid3_20 == 2) ~ "Ind16->!Ind20", 
            TRUE ~ NA_character_) %>% 
            forcats::fct_relevel(c("Ind16->Ind20")),
        # 1 = 2016 Democrat still Democrat in 2020, 0 = 2016 Democrat with a
        # different non-missing 2020 value, NA otherwise.
        stayer_dem_bin = case_when(
            pid3_16 == "Dem" & (pid3_20 == 1) ~ 1, 
            pid3_16 == "Dem" & (pid3_20 != 1) ~ 0, 
            TRUE ~ NA_real_),
        # NOTE(review): unlike stayer_dem_bin, the TRUE ~ 0 fallback here also
        # covers rows with NA pid3_16 and 2016 Democrats with NA pid3_20 --
        # confirm those rows should be 0 rather than NA.
        switcher_dem = case_when(
            #pid3_16 == "Dem" & (pid3_20 == 2 | pid3_20 == 3 | pid3_20 == 5 |  pid3_20 == 0) ~ 1,
            pid3_16 == "Dem" & (pid3_20 != 1) ~ 1, 
            
            pid3_16 == "Rep" ~ NA_real_, 
            pid3_16 == "Ind" ~ NA_real_, 
            TRUE ~ 0),
        # Destination label for 2016 Democrats, levels ordered Dem, Ind, Rep.
        switcher_dem_fct = case_when(
            pid3_16 == "Dem" & (pid3_20 == 1) ~ "Dem16->Dem20",
            pid3_16 == "Dem" & (pid3_20 == 2) ~ "Dem16->Rep20", 
            pid3_16 == "Dem" & (pid3_20 == 3) ~ "Dem16->Ind20", 
            #pid3_16 == "Dem" & (pid3_20 != 1 & pid3_20 != 2 & pid3_20 != 3) ~ "Dem->Oth", 
            TRUE ~ NA_character_) %>% 
            forcats::fct_relevel(c("Dem16->Dem20", "Dem16->Ind20", "Dem16->Rep20")),
        
        # Republican counterparts of the three Democrat measures above.
        stayer_rep_bin = case_when(
            pid3_16 == "Rep" & (pid3_20 == 2) ~ 1, 
            pid3_16 == "Rep" & (pid3_20 != 2) ~ 0, 
            TRUE ~ NA_real_),
        # NOTE(review): same TRUE ~ 0 fallback behaviour as switcher_dem.
        switcher_rep = case_when(
            #pid3_16 == "Dem" & (pid3_20 == 2 | pid3_20 == 3 | pid3_20 == 5 |  pid3_20 == 0) ~ 1,
            pid3_16 == "Rep" & (pid3_20 != 2) ~ 1, 
            pid3_16 == "Dem" ~ NA_real_, 
            pid3_16 == "Ind" ~ NA_real_, 
            TRUE ~ 0),
        switcher_rep_fct = case_when(
            pid3_16 == "Rep" & (pid3_20 == 2) ~ "Rep16->Rep20",
            pid3_16 == "Rep" & (pid3_20 == 1) ~ "Rep16->Dem20", 
            pid3_16 == "Rep" & (pid3_20 == 3) ~ "Rep16->Ind20", 
#            pid3_16 == "Rep" & (pid3_20 != 1 & pid3_20 != 2 & pid3_20 != 3) ~ "Rep->Oth", 
            TRUE ~ NA_character_) %>% forcats::fct_relevel(c("Rep16->Rep20", "Rep16->Ind20", "Rep16->Dem20")),
        
        # Independent counterparts (no stayer_ind_bin is defined).
        # NOTE(review): same TRUE ~ 0 fallback behaviour as switcher_dem.
        switcher_ind = case_when(
            #pid3_16 == "Ind" & (pid3_20 == 1 | pid3_20 == 2) ~ 1,
            pid3_16 == "Ind" & (pid3_20 != 3) ~ 1, 
            pid3_16 == "Dem" ~ NA_real_, 
            pid3_16 == "Rep" ~ NA_real_, 
            TRUE ~ 0),
        switcher_ind_fct = case_when(
            pid3_16 == "Ind" & (pid3_20 == 1) ~ "Ind16->Dem20",
            pid3_16 == "Ind" & (pid3_20 == 2) ~ "Ind16->Rep20", 
            pid3_16 == "Ind" & (pid3_20 == 3) ~ "Ind16->Ind20", 
            #pid3_16 == "Ind" & (pid3_20 != 1 & pid3_20 != 2 & pid3_20 != 3) ~ "Ind->Oth", 
            TRUE ~ NA_character_) %>% 
            forcats::fct_relevel(c("Ind16->Dem20", "Ind16->Ind20", "Ind16->Rep20")),

        # switcher_scale = case_when(
        #     pid3_16 == "Dem" & (pid3_20 == 2 | pid3_20 == 3 | pid3_20 == 5) ~ 1, 
        #     pid3_16 == "Rep" & (pid3_20 == 1 | pid3_20 == 3 | pid3_20 == 5) ~ 1, 
        #     pid3_16 == "Ind" & (pid3_20 == 1 | pid3_20 == 2 | pid3_20 == 5) ~ 1, 
        #     TRUE ~ 0),
        # NOTE(review): this reassigns pid3_20_fct as a plain character
        # vector. An earlier mutate in this file builds the same column with
        # fct_relevel("Ind"); that factor version is replaced here -- confirm
        # which one downstream code expects.
        pid3_20_fct = case_when(
            pid3_20 == 1 ~ "Dem",
            pid3_20 == 2 ~ "Rep",
            pid3_20 == 3 ~ "Ind",
            #pid3_20 == 5 ~ "Other",
            TRUE ~ NA_character_
        ),
        # Numeric party scales: Dem = -1, Ind = 0, Rep = +1, anything else NA.
        pid3_20_scale = case_when(
            pid3_20 == 1 ~ -1,
            pid3_20 == 2 ~ +1,
            pid3_20 == 3 ~  0,
            #pid3_20 == 5 ~ "Other",
            TRUE ~ NA_real_
        ),
        pid3_16_scale = case_when(
            pid3_16 == "Dem" ~ -1,
            pid3_16 == "Rep" ~ +1,
            pid3_16 == "Ind" ~  0,
            TRUE ~ NA_real_
        ),
        
    )


# Score one five-point item: a response of k (1..5) receives scores[k];
# every other value becomes NA.
score_rr_item <- function(item, scores) {
    case_when(
        item == 1 ~ scores[1],
        item == 2 ~ scores[2],
        item == 3 ~ scores[3],
        item == 4 ~ scores[4],
        item == 5 ~ scores[5],
        TRUE ~ as.numeric(NA)
    )
}

# Candidate thermometers (negative codes set to NA), their difference, and the
# four racial-resentment items. rr_favors and rr_harder are reverse-scored;
# rr_slavery and rr_deserve keep their original direction. racial_resent20 is
# the row mean of whichever of the four items are non-missing.
a20 <- a20 %>%
    mutate(
        ft_biden       = ifelse(V201151 >= 0, V201151, NA_real_),
        ft_trump20     = ifelse(V201152 >= 0, V201152, NA_real_),
        ft_trump_biden = ft_trump20 - ft_biden,

        rr_favors  = score_rr_item(V202300, c(5, 4, 3, 2, 1)),
        rr_slavery = score_rr_item(V202301, c(1, 2, 3, 4, 5)),
        rr_deserve = score_rr_item(V202302, c(1, 2, 3, 4, 5)),
        rr_harder  = score_rr_item(V202303, c(5, 4, 3, 2, 1)),

        racial_resent20 = rowMeans(
            cbind(rr_favors, rr_slavery, rr_deserve, rr_harder),
            na.rm = TRUE
        )
    )

#     glm(nchar_align20_ihs ~ racial_resent20 * pid3_20_fct  + ft_trump_biden + female20 + race20_fct + ideo7_16 + educ16 + age16 + income16 + pol_attn16, data = ., family = gaussian)


# Demographic and engagement recodes for 2020. Out-of-range codes become NA,
# except female20, where every value other than 2 is coded "No".
a20 <- a20 %>%
    mutate(
        # copy of the interview-mode variable
        # NOTE(review): the original remark here said "no variation, all web";
        # confirm that still holds for the sample being used.
        mode20 = V200002,
        female20 = as.factor(
            case_when(
                V201600 == 2 ~ "Yes",
                TRUE ~ "No"
            )
        ),
        # Seven-point ideology. Code 4 and code 99 are refined with the
        # follow-up item V201201 (1 = liberal, 2 = conservative, 3 = moderate,
        # per the branch comments); code 4 with no usable follow-up stays 4.
        ideo7_20 = case_when(
            V201200 >= 1  & V201200 <= 3 ~ V201200, # liberal
            V201200 >= 5  & V201200 <= 7 ~ V201200, # conservative
            V201200 == 4  & V201201 == 1 ~ 3,       # slightly liberal
            V201200 == 99 & V201201 == 1 ~ 2,       # moderately liberal
            V201200 == 4  & V201201 == 2 ~ 5,       # slightly conservative
            V201200 == 99 & V201201 == 2 ~ 6,       # moderately conservative
            V201200 == 99 & V201201 == 3 ~ 4,       # moderate
            V201200 == 4                 ~ 4        # moderate
        ),
        educ20 = case_when(
            V201510 >= 1 & V201510 <= 8 ~ V201510,
            TRUE ~ NA_real_
        ),
        income20 = case_when(
            V201617x >= 1 & V201617x <= 22 ~ V201617x,
            TRUE ~ NA_real_
        ),
        pol_attn20 = case_when(
            V201005 >= 1 ~ V201005,
            TRUE ~ NA_real_
        ),
        age20 = case_when(
            V201507x >= 1 ~ V201507x,
            TRUE ~ NA_real_
        )
    )
            


# Map a four-point code to text: code k (1..4) receives labels[k]; any other
# value becomes NA.
label_1to4 <- function(code, labels) {
    case_when(
        code == 1 ~ labels[1],
        code == 2 ~ labels[2],
        code == 3 ~ labels[3],
        code == 4 ~ labels[4],
        TRUE ~ NA_character_
    )
}

# Several codings of V201626, each releveled so the code-4 end of the scale
# comes first and the code-1 end last.
a20 <- a20 %>%
    mutate(
        pol_correct20_fct = V201626 %>%
            label_1to4(c(
                "High Concern for Offense",
                "Moderate Concern for Offense",
                "Moderate Dismissal of Offense",
                "Strong Dismissal of Offense"
            )) %>%
            forcats::fct_relevel("Strong Dismissal of Offense", "Moderate Dismissal of Offense", "Moderate Concern for Offense", "High Concern for Offense"),
        # four categories
        pol_correct20_fct4 = V201626 %>%
            label_1to4(c(
                "Norm Concern: High",
                "Norm Concern: Mod-High",
                "Norm Concern: Mod-Low",
                "Norm Concern: Low"
            )) %>%
            forcats::fct_relevel("Norm Concern: Low", "Norm Concern: Mod-Low", "Norm Concern: Mod-High", "Norm Concern: High"),
        # three categories: codes 2 and 3 share one label
        pol_correct20_fct3 = V201626 %>%
            label_1to4(c(
                "Norm Concern: High",
                "Norm Concern: Moderate",
                "Norm Concern: Moderate",
                "Norm Concern: Low"
            )) %>%
            forcats::fct_relevel("Norm Concern: Low", "Norm Concern: Moderate", "Norm Concern: High"),
        # two categories: codes 1-2 vs codes 3-4
        pol_correct20_fct2 = V201626 %>%
            label_1to4(c(
                "Norm Concern: High",
                "Norm Concern: High",
                "Norm Concern: Low",
                "Norm Concern: Low"
            )) %>%
            forcats::fct_relevel("Norm Concern: Low", "Norm Concern: High"),
        # raw code, with values below 1 set to NA
        pol_correct20_num = case_when(
            V201626 >= 1 ~ V201626,
            TRUE ~ NA_real_
        ),
        # numbered labels, ordered "1 - Low" to "4 - High"
        pol_correct20_num_fct = V201626 %>%
            label_1to4(c("4 - High", "3", "2", "1 - Low")) %>%
            forcats::fct_relevel("1 - Low", "2", "3", "4 - High")
    )


        
# Race/ethnicity in 2020, 2016 -> 2020 race-change measures, and 2020
# presidential vote recodes.
#
# race20_fct only ever takes the values White / Black / Hispanic / Asian /
# Other: every V201549x code other than 1-4 is folded into "Other". A change
# indicator is therefore 1 whenever the 2020 category differs from the 2016
# one, including when the 2020 value landed in "Other".
a20 <- a20 %>% 
    mutate(
        race20_fct = case_when(
            V201549x == 1 ~ "White",
            V201549x == 2 ~ "Black",
            V201549x == 3 ~ "Hispanic",
            V201549x == 4 ~ "Asian",
            # V201549x == 5 ~ "Native",
            # V201549x == 6 ~ "Multiple",
            # V201549x <  0 ~ "Refused-DK",
            TRUE ~ "Other"
        ) %>% as.factor(),
        # Same-vs-different label per 2016 group; reference level is
        # "White16->White20".
        racechg_fct = case_when(
            race16 == "white"    & race20_fct == "White"    ~ "White16->White20",
            race16 == "white"    & race20_fct != "White"    ~ "White16->!White20",
            race16 == "black"    & race20_fct == "Black"    ~ "Black16->Black20",
            race16 == "black"    & race20_fct != "Black"    ~ "Black16->!Black20",
            race16 == "hispanic" & race20_fct == "Hispanic" ~ "Hispanic16->Hispanic20",
            race16 == "hispanic" & race20_fct != "Hispanic" ~ "Hispanic16->!Hispanic20",
            race16 == "asian"    & race20_fct == "Asian"    ~ "Asian16->Asian20",
            race16 == "asian"    & race20_fct != "Asian"    ~ "Asian16->!Asian20",
            # (race16 == "native_american" | race16 == "other")  & race20_fct == "Native"   ~ "Nativ16->Native20",
            # race16 == "white"    & race20_fct == "Multiple" ~ "White16->Multiple20",
            # race16 == "native_american"    & race20_fct == "Native"   ~ "Nativ16->Native20",
            TRUE ~ NA_character_
        ) %>% as.factor() %>% 
            relevel(ref = "White16->White20"),
        # As racechg_fct, with Black / Hispanic / Asian pooled as "PoC".
        racechg2_fct = case_when(
            race16 == "white"    & race20_fct == "White"    ~ "White16->White20",
            race16 == "white"    & race20_fct != "White"    ~ "White16->!White20",
            race16 == "black"    & race20_fct == "Black"    ~ "PoC16->PoC20",
            race16 == "black"    & race20_fct != "Black"    ~ "PoC16->!PoC20",
            race16 == "hispanic" & race20_fct == "Hispanic" ~ "PoC16->PoC20",
            race16 == "hispanic" & race20_fct != "Hispanic" ~ "PoC16->!PoC20",
            race16 == "asian"    & race20_fct == "Asian"    ~ "PoC16->PoC20",
            race16 == "asian"    & race20_fct != "Asian"    ~ "PoC16->!PoC20",
            # (race16 == "native_american" | race16 == "other")  & race20_fct == "Native"   ~ "Nativ16->Native20",
            # race16 == "white"    & race20_fct == "Multiple" ~ "White16->Multiple20",
            # race16 == "native_american"    & race20_fct == "Native"   ~ "Nativ16->Native20",
            TRUE ~ NA_character_
        ) %>% as.factor() %>% 
            relevel(ref = "White16->White20"),
        # NOTE(review): the "Native" and "Multiple" branches below can never
        # match because race20_fct has no such levels (those codes are folded
        # into "Other"), so White16 -> Other20 rows end up NA. Left unchanged.
        racechg_wht_fct = case_when(
            race16 == "white" & race20_fct == "White"    ~ "White16->White20",
            race16 == "white" & race20_fct == "Black"    ~ "White16->Black20",
            race16 == "white" & race20_fct == "Hispanic" ~ "White16->Hispanic20",
            race16 == "white" & race20_fct == "Asian"    ~ "White16->Asian20",
            race16 == "white" & race20_fct == "Native"   ~ "White16->Native20",
            race16 == "white" & race20_fct == "Multiple" ~ "White16->Multiple20",
            TRUE ~ NA_character_
        ),
        racechg_hisp_fct = case_when(
            race16 == "hispanic" & race20_fct == "White"    ~ "Hispanic16->White20",
            race16 == "hispanic" & race20_fct == "Black"    ~ "Hispanic16->Black20",
            race16 == "hispanic" & race20_fct == "Hispanic" ~ "Hispanic16->Hispanic20",
            #race16 == "hispanic" & race20_fct == "Asian"    ~ "Hispanic16->Asian20",
            #race16 == "hispanic" & race20_fct == "Native"   ~ "Hispanic16->Native20",
            #race16 == "hispanic" & race20_fct == "Multiple" ~ "Hispanic16->Multiple20",
            TRUE ~ NA_character_
        ),
        # NOTE(review): the "Multiple" branch can never match (see above).
        racechg_blk_fct = case_when(
            race16 == "black" & race20_fct == "White"    ~ "Black16->White20",
            race16 == "black" & race20_fct == "Black"    ~ "Black16->Black20",
            race16 == "black" & race20_fct == "Hispanic" ~ "Black16->Hispanic20",
            race16 == "black" & race20_fct == "Multiple" ~ "Black16->Multiple20",
            #race16 == "hispanic" & race20_fct == "Asian"    ~ "Hispanic16->Asian20",
            #race16 == "hispanic" & race20_fct == "Native"   ~ "Hispanic16->Native20",
            TRUE ~ NA_character_
        ),
        # Binary change flags: 1 = 2020 category differs from 2016, 0 = same,
        # NA for respondents not in that 2016 group.
        racechg_blk = case_when(
            race16 == "black" & race20_fct != "Black" ~ 1,
            race16 == "black" & race20_fct == "Black" ~ 0,
            TRUE ~ NA_real_
        ),
        racechg_hisp = case_when(
            race16 == "hispanic" & race20_fct != "Hispanic" ~ 1,
            race16 == "hispanic" & race20_fct == "Hispanic" ~ 0,
            TRUE ~ NA_real_
        ),
        racechg_asian = case_when(
            race16 == "asian" & race20_fct != "Asian" ~ 1,
            race16 == "asian" & race20_fct == "Asian" ~ 0,
            TRUE ~ NA_real_
        ),
        # This flag was previously also assigned to `racechg_asian`, which
        # silently replaced the Asian flag above. It now has its own column,
        # and it is keyed on the raw code (V201549x == 5, the "Native" code
        # per the commented mapping in race20_fct) because race20_fct has no
        # "Native" level to compare against.
        racechg_native = case_when(
            race16 == "native_american" & V201549x != 5 ~ 1,
            race16 == "native_american" & V201549x == 5 ~ 0,
            TRUE ~ NA_real_
        ),
        # Total number of changes (some people may only have one race16 flag, so use rowSums with na.rm=TRUE)
        #racechg_tot = rowSums(across(starts_with("racechg_")), na.rm = TRUE),
        
        # 2020 presidential vote choice from V202073; other codes are NA.
        vote20 = case_when(
            V202073 == 1 ~ "Biden",
            V202073 == 2 ~ "Trump",
            V202073 == 3 ~ "Jorgensen",
            V202073 == 4 ~ "Hawkins",
            V202073 == 5 ~ "Other",
            TRUE ~ NA_character_
        ),
        # NOTE(review): the two indicators below are 0 for every value other
        # than the named candidate, which includes any non-vote or missing
        # codes in V202073 -- confirm that is intended.
        vote_biden20_bin = case_when(
            V202073 == 1 ~ 1,
            TRUE ~ 0
        ),
        vote_trump20_bin = case_when(
            V202073 == 2 ~ 1,
            TRUE ~ 0
        ),
        # Two-candidate version: Biden / Trump, everything else NA.
        vote20_fct = case_when(
            V202073 == 1 ~ "Biden",
            V202073 == 2 ~ "Trump",
            TRUE ~ NA_character_
        )
    )



# Vote likelihood plus non-response counts and asinh-transformed lengths for
# the four candidate like/dislike open ends.
#   lddr = like Democrat + dislike Republican
#   lrdd = like Republican + dislike Democrat
a20 <- a20 %>%
    mutate(
        # reg_loc20_bin = case_when(
        #     V201008 == 1 | V201008 == 2 ~ 1,
        #     TRUE ~ 0
        # ),

        # V201100 codes 1..5 are flipped to 5..1; anything else is NA
        likely_vote20 = case_when(
            V201100 >= 1 & V201100 <= 5 ~ 6 - V201100,
            TRUE ~ NA_real_
        ),

        # number of blank answers (0, 1 or 2) within each pair
        nonresp20_lddr = as.numeric(text_dem20_like_nchar == 0) + as.numeric(text_rep20_dislike_nchar == 0),
        nonresp20_lrdd = as.numeric(text_rep20_like_nchar == 0) + as.numeric(text_dem20_dislike_nchar == 0),
        nonresp20_all  = (nonresp20_lrdd - nonresp20_lddr),

        # 1 when at least one answer of the pair is blank
        nonresp20_lddr_bin = as.numeric(text_dem20_like_nchar == 0 | text_rep20_dislike_nchar == 0),
        nonresp20_lrdd_bin = as.numeric(text_rep20_like_nchar == 0 | text_dem20_dislike_nchar == 0),

        # inverse hyperbolic sine of each answer length
        nchar20_like_dem_ihs    = asinh(text_dem20_like_nchar),
        nchar20_dislike_dem_ihs = asinh(text_dem20_dislike_nchar),
        nchar20_like_rep_ihs    = asinh(text_rep20_like_nchar),
        nchar20_dislike_rep_ihs = asinh(text_rep20_dislike_nchar)
    )


# Within each pre-election interview mode: take the largest asinh length of
# each like/dislike item (stored as max_<column>), express every answer as a
# share of that maximum, and combine the shares into the alignment score
# nchar_align20_ihs = (lrdd - lddr) / 2.
a20 <- a20 %>%
    group_by(mode_pre20) %>%
    mutate(
        across(
            c(
                nchar20_like_dem_ihs,
                nchar20_dislike_dem_ihs,
                nchar20_like_rep_ihs,
                nchar20_dislike_rep_ihs
            ),
            max,
            .names = "max_{.col}"
        ),

        nchar20_lddr_ihs =
            nchar20_like_dem_ihs / max_nchar20_like_dem_ihs +
            nchar20_dislike_rep_ihs / max_nchar20_dislike_rep_ihs,

        nchar20_lrdd_ihs =
            nchar20_like_rep_ihs / max_nchar20_like_rep_ihs +
            nchar20_dislike_dem_ihs / max_nchar20_dislike_dem_ihs,

        nchar_align20_ihs = (nchar20_lrdd_ihs - nchar20_lddr_ihs) / 2
    ) %>%
    ungroup()



## Most Important Problem
# Length measures for the four "most important problem" open ends:
#   *_ihs        asinh of the character count
#   *_bin        1 when any text was given
#   max_*        largest value within the respondent's pre-election mode
#   *_norm       value divided by that within-mode maximum
# followed by totals across the four items.
a20 <- a20 %>%
    mutate(
        text_prob20_1_ihs = asinh(text_prob20_1_nchar),
        text_prob20_2_ihs = asinh(text_prob20_2_nchar),
        text_prob20_3_ihs = asinh(text_prob20_3_nchar),
        text_prob20_4_ihs = asinh(text_prob20_4_nchar),

        text_prob20_1_bin = as.numeric(text_prob20_1_nchar > 0),
        text_prob20_2_bin = as.numeric(text_prob20_2_nchar > 0),
        text_prob20_3_bin = as.numeric(text_prob20_3_nchar > 0),
        text_prob20_4_bin = as.numeric(text_prob20_4_nchar > 0),

        # number of items answered (0-4)
        text_prob20_bin = text_prob20_1_bin + text_prob20_2_bin + text_prob20_3_bin + text_prob20_4_bin
    ) %>%
    group_by(mode_pre20) %>%
    mutate(
        across(
            c(text_prob20_1_ihs, text_prob20_2_ihs, text_prob20_3_ihs, text_prob20_4_ihs),
            max,
            .names = "max_{.col}"
        ),
        text_prob20_1_ihs_norm = text_prob20_1_ihs / max_text_prob20_1_ihs,
        text_prob20_2_ihs_norm = text_prob20_2_ihs / max_text_prob20_2_ihs,
        text_prob20_3_ihs_norm = text_prob20_3_ihs / max_text_prob20_3_ihs,
        text_prob20_4_ihs_norm = text_prob20_4_ihs / max_text_prob20_4_ihs,

        across(
            c(text_prob20_1_nchar, text_prob20_2_nchar, text_prob20_3_nchar, text_prob20_4_nchar),
            max,
            .names = "max_{.col}"
        ),
        text_prob20_1_nchar_norm = text_prob20_1_nchar / max_text_prob20_1_nchar,
        text_prob20_2_nchar_norm = text_prob20_2_nchar / max_text_prob20_2_nchar,
        text_prob20_3_nchar_norm = text_prob20_3_nchar / max_text_prob20_3_nchar,
        text_prob20_4_nchar_norm = text_prob20_4_nchar / max_text_prob20_4_nchar
    ) %>%
    ungroup() %>%
    mutate(
        # sum of the mode-normalised asinh lengths
        nchar_problems20_ihs = text_prob20_1_ihs_norm + text_prob20_2_ihs_norm + text_prob20_3_ihs_norm + text_prob20_4_ihs_norm,

        # sum of the raw asinh lengths
        nchar_prob20_all_ihs2 = text_prob20_1_ihs + text_prob20_2_ihs + text_prob20_3_ihs + text_prob20_4_ihs,

        # total characters, its asinh, and an any-text flag
        nchar_prob20_tot     = text_prob20_1_nchar + text_prob20_2_nchar + text_prob20_3_nchar + text_prob20_4_nchar,
        nchar_prob20_tot_ihs = asinh(nchar_prob20_tot),
        nchar_prob20_bin     = as.numeric(nchar_prob20_tot > 0),

        # sum of the mode-normalised character counts and its asinh
        nchar_prob20_norm_tot     = text_prob20_1_nchar_norm + text_prob20_2_nchar_norm + text_prob20_3_nchar_norm + text_prob20_4_nchar_norm,
        nchar_prob20_norm_tot_ihs = asinh(nchar_prob20_norm_tot)
    )



# Any-text factor ("1"/"0") for the combined problem score, and the number of
# the four problem items with a non-zero character count.
a20 <- a20 %>%
    mutate(
        nchar_problems20_bin = as.factor(
            case_when(
                nchar_problems20_ihs > 0 ~ "1",
                TRUE ~ "0"
            )
        ),

        # nchar_problems20_fct = 
        #     case_when(
        #         nchar_problems20_ihs == 0 ~ as.factor("0"), 
        #         TRUE ~ santoku::chop_equally(round(nchar_problems20_ihs,1), groups = 4) 
            # ),

        n_answered20 = rowSums(
            cbind(
                text_prob20_1_nchar,
                text_prob20_2_nchar,
                text_prob20_3_nchar,
                text_prob20_4_nchar
            ) != 0,
            na.rm = TRUE
        )
    )


# create ordered categorical version of nchar_problems20_ihs
# Step 1: rows with a score of exactly 0 get the label "0"; all other rows are
# cut at the quartiles of the non-zero scores and labelled with cut()'s
# interval text. Step 2: the levels are re-ordered so "0" comes first and the
# interval labels follow in increasing order of their lower bound.
# NOTE(review): quantile() fails if nchar_problems20_ihs contains NA/NaN, and
# cut() fails if two quartile breaks coincide -- confirm neither can occur.
a20 <- a20 |> 
    mutate(
        nchar_problems20_fct = {
            # quartile break points computed on the non-zero scores only
            non_zero <- nchar_problems20_ihs[nchar_problems20_ihs > 0]
            breaks <- quantile(non_zero, probs = c(0, 0.25, 0.5, 0.75, 1))
            
            case_when(
                nchar_problems20_ihs == 0 ~ "0",
                # include.lowest = TRUE keeps the smallest non-zero score in the first bin
                TRUE ~ as.character(cut(nchar_problems20_ihs, breaks = breaks, include.lowest = TRUE))
            )
        },
        # factor() on the character labels orders the levels alphabetically;
        # the next mutate() replaces that ordering
        nchar_problems20_fct = factor(nchar_problems20_fct)
    ) |> 
    
    mutate(
        nchar_problems20_fct = {
            lvls <- levels(nchar_problems20_fct)
            # Put "0" first, then sort the rest by extracting the lower bound
            non_zero_lvls <- lvls[lvls != "0"]
            # Extract first number from each label
            # (the pattern matches digits and dots after the opening bracket,
            # so it assumes plain non-negative decimals in the labels)
            lower_bounds <- as.numeric(gsub("^[\\[\\(]([0-9.]+).*", "\\1", non_zero_lvls))
            non_zero_lvls <- non_zero_lvls[order(lower_bounds)]
            
            factor(nchar_problems20_fct, levels = c("0", non_zero_lvls))
        }
    )

# a20 <- a20 %>%
#     group_by(mode_pre20) %>%
#     mutate(
#         text_prob20_1_ihs_norm = text_prob20_1_ihs / max(text_prob20_1_ihs, na.rm = TRUE),
#         text_prob20_2_ihs_norm = text_prob20_2_ihs / max(text_prob20_2_ihs, na.rm = TRUE),
#         text_prob20_3_ihs_norm = text_prob20_3_ihs / max(text_prob20_3_ihs, na.rm = TRUE),
#         text_prob20_4_ihs_norm = text_prob20_4_ihs / max(text_prob20_4_ihs, na.rm = TRUE)
#     ) %>%
#     ungroup()
# 
# a20 <- a20 %>%
#     mutate(nchar_problems20_ihs = text_prob20_1_ihs_norm +
#                text_prob20_2_ihs_norm +
#                text_prob20_3_ihs_norm +
#                text_prob20_4_ihs_norm)



# Exploratory plots moved to analysis Rmd
# a20 %>%
#     ggplot(aes(y = nchar_problems20_ihs, x = nchar_prob20_tot_ihs, color = n_answered20)) +
#     geom_jitter(width = .1, height = .2, alpha = .2)


# Binned and binary versions of the total character count across the four
# "most important problem" answers.
#
# prob20_fct bins: 0 -> 0, (0, 46] -> 1, (46, 108] -> 2, (108, 170] -> 3,
# (170, 210) -> 4, everything else -> 5.
# The earlier strict inequalities on both sides of 108 and 170 left those two
# exact totals unmatched, so they fell through to category 5; the upper edge
# of bins 2 and 3 is now inclusive. All other totals are binned as before.
a20 <- a20 %>% 
    mutate(
        #prob20_fct = santoku::chop_equally(nchar_problems20_ihs, 5),
        prob20_fct = case_when(
            nchar_prob20_tot == 0 ~ 0,
            nchar_prob20_tot >   0 & nchar_prob20_tot <=  46 ~ 1,
            nchar_prob20_tot >  46 & nchar_prob20_tot <= 108 ~ 2,
            nchar_prob20_tot > 108 & nchar_prob20_tot <= 170 ~ 3,
            nchar_prob20_tot > 170 & nchar_prob20_tot <  210 ~ 4,
            TRUE ~ 5
        ) %>% as.factor(),
        # 0 when no characters were written in any of the four answers, else 1
        prob20_bin = case_when(
            nchar_prob20_tot == 0 ~ 0,
            TRUE ~ 1
        )            
        # prob20_fct = case_when(
        #     nchar_problems20_ihs == 0 ~ "0",
        #     nchar_problems20_ihs > 0 ~ santoku::chop_equally(nchar_problems20_ihs, 10),
        #prob20_fct = santoku::chop_deciles(nchar_problems20_ihs)
    )            
    #)




# Validated-turnout recodes from the voter-validation columns.
a20 <- a20 %>%
    mutate(
        # text label for each val1_turnout20 code; other codes become NA
        turnout20_fct = case_when(
            val1_turnout20 == 1 ~ "linkage not attempted",
            val1_turnout20 == 2 ~ "no record found",
            val1_turnout20 == 3 ~ "linked, not voting",
            val1_turnout20 == 4 ~ "linked, voted absentee",
            val1_turnout20 == 5 ~ "linked, voted early",
            val1_turnout20 == 6 ~ "linked, voted in person"
        ),

        # 0 = linked and not voting, 1 = any code above 3, NA for codes below 3
        vote_validated20_bin = case_when(
            val1_turnout20 == 3 ~ 0,
            val1_turnout20 >  3 ~ 1,
            TRUE ~ NA_real_
        ),

        # match weight: val2_match when it is 0 or 1, otherwise the
        # val1_matchprob value when one is present
        match_ok = case_when(
            val2_match == 1 ~ 1,              # Best available match signal
            val2_match == 0 ~ 0,
            !is.na(val1_matchprob) ~ val1_matchprob, # fallback probability
            TRUE ~ NA_real_
        ),

        # validated vote multiplied by the match weight
        vote_valid20_weighted = vote_validated20_bin * match_ok
    )


# rescale ideo7_16 to -2 to 2, centered at 0
# a20$ideo7_scaled <- (a20$ideo7_16 - 4) * (2 / 3)
#a20$ideo7_scaled <- scales::rescale(a20$ideo7_16, to = c(-2, 2))

# compute difference
# a20$ideology_discrepancy <- a20$nonresp_nchar_all_log - a20$ideo7_scaled



## ---- impute-missingness-2020 ----

# skipping racial_resent16, ft... due to post-wave drop out
# i.e., structural missingness
# 
# Subset of key variables with missingness + relevant predictors
# vars_for_impute <- c("income20", "age20", "educ20", "female20")
# vars_for_impute <- c("income20")

# a20 <- sjlabelled::remove_all_labels(a20)
# 
# Create subset for imputation
# imp_data <- a20[ , vars_for_impute]
 
# Run imputation with Predictive Mean Matching (PMM) — flexible + safe for non-normal data
# imp <- mice(imp_data, m = 1, method = "pmm", seed = 123)
 
# Get completed data
# imputed_df <- complete(imp, 1)
 
# Replace original variables (optional: rename with `_imp` suffix if you want to track changes)
# a20[ , vars_for_impute] <- imputed_df



# Amelia requires all variables to be numeric or factor
# amelia_out <- amelia(a20[ , vars_for_impute],
#                      m = 1,
#                      idvars = NULL, # specify if any variables shouldn't be imputed
#                      #noms = c("female16", "race16"), # categorical variables
#                      seed = 123)
# 
# # Extract imputed data
# a20[ , vars_for_impute] <- amelia_out$imputations[[1]]



# Model diagnostics moved to analysis Rmd
# a20 %>% glm(vote_validated20_bin ~ nchar_problems20_ihs + ..., family = binomial, data = .) %>% summary()

# Strip haven value labels from a20, then write the merged 2016/2020 data
# frame to the output folder.
a20 <- haven::zap_labels(a20)

save(
    a20,
    file = here::here("text_data_output", "anes2020_merged.Rdata")
)


